library(glmnet)
library(tidyverse)
library(magrittr)
library(pROC)
library(ROCR)
library(MASS)
library(ROSE)
library(caret)
library(leaps)
library(sjPlot)
library(sjlabelled)
library(sjmisc)
#read in full data set
df_simple = read.csv('./bank-full.csv')
#Converting to Factors
df_simple$y <- factor(df_simple$y)
df_simple$job = factor(df_simple$job)
df_simple$marital = factor(df_simple$marital)
df_simple$education = factor(df_simple$education)
df_simple$default = factor(df_simple$default)
df_simple$housing = factor(df_simple$housing)
df_simple$loan = factor(df_simple$loan)
df_simple$contact = factor(df_simple$contact)
df_simple$poutcome = factor(df_simple$poutcome)
df_simple$month = factor(df_simple$month)
#create test train split
set.seed(1)
simple_test_index <- sample(1:nrow(df_simple), size = 0.2*nrow(df_simple))
test_simple <- df_simple[simple_test_index,]
train_simple<- df_simple[-simple_test_index,]
#using clean data set from GitHub
df = read.csv('./clean.csv')
#head(df)
#converting to factors
df$y = as.factor(df$y)
df$job = factor(df$job)
df$marital = factor(df$marital)
df$education = factor(df$education)
df$default = factor(df$default)
df$housing = factor(df$housing)
df$loan = factor(df$loan)
df$contact = factor(df$contact)
df$poutcome = factor(df$poutcome)
df$month = factor(df$month)
#create test train split
set.seed(1)
#balance train data set
test_index <- sample(1:nrow(df), size = 0.2*nrow(df))
test <- df[test_index,]
train<- df[-test_index,]
both_train <- ovun.sample(y~., data=train, method="both")
both_train <- both_train$data
table(both_train$y)
##
## no yes
## 14888 15062
#defining response variable
train_resp = train$y
test_resp = as.factor(test$y)
#defining explanatory variables
train_expl = data.matrix(train[,1:16])
test_expl = data.matrix(test[,1:16])
#unclean data proportions
writeLines("Proportion of No to Yes in Orignial Data")
## Proportion of No to Yes in Orignial Data
prop.table(table(df_simple$y))
##
## no yes
## 0.8830152 0.1169848
writeLines('')
writeLines("Proportion of No to Yes in Test Split from Original Data")
## Proportion of No to Yes in Test Split from Original Data
prop.table(table(test_simple$y))
##
## no yes
## 0.8786773 0.1213227
#clean data proporation
writeLines("Proportion of No to Yes in Clean/Transformed Data")
## Proportion of No to Yes in Clean/Transformed Data
prop.table(table(df$y))
##
## no yes
## 0.8730133 0.1269867
writeLines('')
writeLines("Proportion of No to Yes in Clean/Transformed Data with Balancing")
## Proportion of No to Yes in Clean/Transformed Data with Balancing
prop.table(table(both_train$y))
##
## no yes
## 0.4970952 0.5029048
#create simple model
slr = glm(y ~ ., data = train_simple, family = "binomial")
summary(slr)
##
## Call:
## glm(formula = y ~ ., family = "binomial", data = train_simple)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.9282 -0.3754 -0.2550 -0.1523 3.4323
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.508e+00 2.046e-01 -12.258 < 2e-16 ***
## age 2.548e-04 2.464e-03 0.103 0.917646
## jobblue-collar -3.331e-01 8.091e-02 -4.116 3.85e-05 ***
## jobentrepreneur -4.834e-01 1.438e-01 -3.362 0.000774 ***
## jobhousemaid -6.201e-01 1.544e-01 -4.016 5.92e-05 ***
## jobmanagement -1.701e-01 8.177e-02 -2.081 0.037450 *
## jobretired 1.751e-01 1.092e-01 1.603 0.108850
## jobself-employed -3.496e-01 1.259e-01 -2.776 0.005497 **
## jobservices -2.952e-01 9.446e-02 -3.125 0.001779 **
## jobstudent 3.225e-01 1.234e-01 2.614 0.008956 **
## jobtechnician -2.180e-01 7.698e-02 -2.832 0.004621 **
## jobunemployed -1.770e-01 1.244e-01 -1.424 0.154578
## jobunknown -5.907e-02 2.434e-01 -0.243 0.808243
## maritalmarried -1.522e-01 6.624e-02 -2.298 0.021541 *
## maritalsingle 1.316e-01 7.549e-02 1.744 0.081211 .
## educationsecondary 1.217e-01 7.138e-02 1.705 0.088176 .
## educationtertiary 2.990e-01 8.339e-02 3.585 0.000337 ***
## educationunknown 2.032e-01 1.155e-01 1.759 0.078631 .
## defaultyes -1.062e-01 1.839e-01 -0.578 0.563480
## balance 1.153e-05 5.629e-06 2.048 0.040585 *
## housingyes -6.549e-01 4.901e-02 -13.363 < 2e-16 ***
## loanyes -3.440e-01 6.567e-02 -5.239 1.62e-07 ***
## contacttelephone -2.112e-01 8.460e-02 -2.497 0.012540 *
## contactunknown -1.592e+00 8.137e-02 -19.560 < 2e-16 ***
## day 9.878e-03 2.796e-03 3.532 0.000412 ***
## monthaug -6.290e-01 8.786e-02 -7.159 8.13e-13 ***
## monthdec 7.345e-01 1.976e-01 3.717 0.000202 ***
## monthfeb -9.934e-02 1.000e-01 -0.993 0.320628
## monthjan -1.222e+00 1.360e-01 -8.984 < 2e-16 ***
## monthjul -7.983e-01 8.712e-02 -9.163 < 2e-16 ***
## monthjun 4.926e-01 1.042e-01 4.727 2.27e-06 ***
## monthmar 1.606e+00 1.348e-01 11.914 < 2e-16 ***
## monthmay -3.971e-01 8.105e-02 -4.900 9.58e-07 ***
## monthnov -8.067e-01 9.386e-02 -8.595 < 2e-16 ***
## monthoct 9.358e-01 1.217e-01 7.687 1.50e-14 ***
## monthsep 7.963e-01 1.371e-01 5.810 6.25e-09 ***
## duration 4.225e-03 7.285e-05 58.000 < 2e-16 ***
## campaign -9.061e-02 1.124e-02 -8.060 7.62e-16 ***
## pdays -1.952e-04 3.514e-04 -0.555 0.578576
## previous 8.261e-03 6.394e-03 1.292 0.196344
## poutcomeother 2.337e-01 1.002e-01 2.333 0.019648 *
## poutcomesuccess 2.278e+00 9.230e-02 24.678 < 2e-16 ***
## poutcomeunknown -1.185e-01 1.049e-01 -1.130 0.258514
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25946 on 36168 degrees of freedom
## Residual deviance: 17268 on 36126 degrees of freedom
## AIC: 17354
##
## Number of Fisher Scoring iterations: 6
#CI and Odds Ratio
exp(cbind(OR = coef(slr), confint(slr)))
## OR 2.5 % 97.5 %
## (Intercept) 0.08146003 0.05449355 0.1215207
## age 1.00025483 0.99542991 1.0050924
## jobblue-collar 0.71671807 0.61164316 0.8399795
## jobentrepreneur 0.61668846 0.46283882 0.8135720
## jobhousemaid 0.53787135 0.39522358 0.7242655
## jobmanagement 0.84354388 0.71888612 0.9905529
## jobretired 1.19135111 0.96140278 1.4751354
## jobself-employed 0.70496348 0.54926518 0.8999904
## jobservices 0.74438926 0.61797162 0.8950207
## jobstudent 1.38061574 1.08275410 1.7565309
## jobtechnician 0.80410087 0.69161212 0.9352702
## jobunemployed 0.83775803 0.65487550 1.0664679
## jobunknown 0.94263760 0.57586557 1.4977698
## maritalmarried 0.85877370 0.75486413 0.9787217
## maritalsingle 1.14069056 0.98442411 1.3234801
## educationsecondary 1.12943593 0.98267036 1.3000297
## educationtertiary 1.34846024 1.14571646 1.5887769
## educationunknown 1.22527426 0.97543847 1.5343390
## defaultyes 0.89923040 0.61811996 1.2726620
## balance 1.00001153 1.00000029 1.0000224
## housingyes 0.51948490 0.47181741 0.5717653
## loanyes 0.70889846 0.62251575 0.8053401
## contacttelephone 0.80960549 0.68466945 0.9539734
## contactunknown 0.20361164 0.17343521 0.2386078
## day 1.00992653 1.00440654 1.0154775
## monthaug 0.53311478 0.44884076 0.6334263
## monthdec 2.08445012 1.41218540 3.0662383
## monthfeb 0.90543540 0.74402152 1.1012781
## monthjan 0.29470002 0.22488150 0.3833568
## monthjul 0.45009364 0.37943073 0.5339145
## monthjun 1.63650295 1.33432500 2.0075304
## monthmar 4.98074485 3.82445399 6.4874834
## monthmay 0.67223651 0.57368770 0.7882740
## monthnov 0.44630825 0.37110619 0.5361949
## monthoct 2.54917636 2.00725797 3.2350693
## monthsep 2.21735747 1.69391487 2.8992936
## duration 1.00423441 1.00409188 1.0043787
## campaign 0.91337835 0.89315095 0.9333850
## pdays 0.99980482 0.99911377 1.0004913
## previous 1.00829532 0.99502452 1.0235788
## poutcomeother 1.26327446 1.03693016 1.5358046
## poutcomesuccess 9.75413575 8.14662209 11.6983358
## poutcomeunknown 0.88823582 0.72384161 1.0921812
#predictions
slrPredict = predict(slr,newdata = test_simple,type = 'response')
plot(slrPredict,
col = test_simple$y,
main = 'Simple Log Regression Predictions',
ylab = 'Prediction',
xlab = 'Data Index')

slrPredictnew<-factor(ifelse(as.numeric(slrPredict)>.5,'yes','no' ) ,levels = c('no','yes'))
confusionMatrix(slrPredictnew,test_simple$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 7751 679
## yes 194 418
##
## Accuracy : 0.9035
## 95% CI : (0.8972, 0.9095)
## No Information Rate : 0.8787
## P-Value [Acc > NIR] : 5.575e-14
##
## Kappa : 0.4406
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9756
## Specificity : 0.3810
## Pos Pred Value : 0.9195
## Neg Pred Value : 0.6830
## Prevalence : 0.8787
## Detection Rate : 0.8572
## Detection Prevalence : 0.9323
## Balanced Accuracy : 0.6783
##
## 'Positive' Class : no
##
#Pearson Chi^2 Test
sum(residuals(slr, type = "pearson")^2)
## [1] 411212.5
#Plotting Odds Ratios
plot_model(slr,sort.est = TRUE,grid = TRUE,title = 'Odds Ratio for Explanitory Variables: SLR')

#create simple model
slr_clean = glm(y ~ ., data = train, family = "binomial")
summary(slr_clean)
##
## Call:
## glm(formula = y ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9889 -0.3955 -0.2112 -0.0950 4.0799
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.171e+01 2.972e-01 -39.393 < 2e-16 ***
## age -1.743e-03 2.638e-03 -0.661 0.50878
## jobblue-collar -3.700e-01 8.781e-02 -4.213 2.52e-05 ***
## jobentrepreneur -4.320e-01 1.542e-01 -2.801 0.00509 **
## jobhousemaid -4.071e-01 1.592e-01 -2.557 0.01057 *
## jobmanagement -1.290e-01 8.895e-02 -1.451 0.14685
## jobretired 1.694e-01 1.169e-01 1.448 0.14758
## jobself-employed -3.460e-01 1.320e-01 -2.621 0.00878 **
## jobservices -2.932e-01 1.021e-01 -2.871 0.00409 **
## jobstudent 2.784e-01 1.329e-01 2.095 0.03620 *
## jobtechnician -1.641e-01 8.340e-02 -1.967 0.04914 *
## jobunemployed -1.913e-01 1.316e-01 -1.454 0.14604
## jobunknown -2.584e-01 2.809e-01 -0.920 0.35768
## maritalmarried -2.053e-01 7.206e-02 -2.849 0.00439 **
## maritalsingle 5.425e-02 8.176e-02 0.663 0.50703
## educationsecondary 1.197e-01 7.645e-02 1.566 0.11733
## educationtertiary 2.717e-01 8.908e-02 3.050 0.00229 **
## educationunknown 1.485e-01 1.235e-01 1.203 0.22911
## defaultyes -5.871e-03 3.221e-01 -0.018 0.98546
## balance 5.783e-02 1.441e-02 4.012 6.01e-05 ***
## housingyes -6.791e-01 5.238e-02 -12.965 < 2e-16 ***
## loanyes -3.749e-01 7.390e-02 -5.073 3.91e-07 ***
## contacttelephone -1.000e-01 9.220e-02 -1.085 0.27802
## contactunknown -1.572e+00 8.583e-02 -18.312 < 2e-16 ***
## day 1.305e-02 3.022e-03 4.319 1.57e-05 ***
## monthaug -5.113e-01 9.453e-02 -5.408 6.36e-08 ***
## monthdec 8.351e-01 2.083e-01 4.009 6.10e-05 ***
## monthfeb 2.223e-02 1.073e-01 0.207 0.83589
## monthjan -1.146e+00 1.431e-01 -8.011 1.14e-15 ***
## monthjul -7.486e-01 9.393e-02 -7.970 1.58e-15 ***
## monthjun 6.665e-01 1.123e-01 5.936 2.92e-09 ***
## monthmar 1.898e+00 1.489e-01 12.749 < 2e-16 ***
## monthmay -2.234e-01 8.674e-02 -2.575 0.01002 *
## monthnov -7.434e-01 9.954e-02 -7.469 8.08e-14 ***
## monthoct 1.009e+00 1.335e-01 7.558 4.09e-14 ***
## monthsep 1.009e+00 1.504e-01 6.710 1.94e-11 ***
## duration 1.823e+00 3.343e-02 54.536 < 2e-16 ***
## campaign -6.605e-02 1.218e-02 -5.423 5.86e-08 ***
## pdays -3.582e-04 3.724e-04 -0.962 0.33624
## previous 7.397e-03 6.701e-03 1.104 0.26964
## poutcomeother 2.190e-01 1.098e-01 1.995 0.04605 *
## poutcomesuccess 2.373e+00 1.004e-01 23.636 < 2e-16 ***
## poutcomeunknown -7.274e-02 1.118e-01 -0.651 0.51528
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 22686 on 29949 degrees of freedom
## Residual deviance: 14468 on 29907 degrees of freedom
## AIC: 14554
##
## Number of Fisher Scoring iterations: 6
#CI and Odds Ratio
exp(cbind(OR = coef(slr_clean), confint(slr_clean)))
## OR 2.5 % 97.5 %
## (Intercept) 8.238035e-06 4.586338e-06 1.470443e-05
## age 9.982586e-01 9.931047e-01 1.003428e+00
## jobblue-collar 6.907386e-01 5.815715e-01 8.206011e-01
## jobentrepreneur 6.492240e-01 4.774057e-01 8.742601e-01
## jobhousemaid 6.655822e-01 4.847676e-01 9.053471e-01
## jobmanagement 8.789321e-01 7.385848e-01 1.046786e+00
## jobretired 1.184535e+00 9.415751e-01 1.489288e+00
## jobself-employed 7.075114e-01 5.448579e-01 9.144445e-01
## jobservices 7.458983e-01 6.100351e-01 9.104335e-01
## jobstudent 1.321072e+00 1.017118e+00 1.712901e+00
## jobtechnician 8.486758e-01 7.208675e-01 9.996945e-01
## jobunemployed 8.258990e-01 6.369023e-01 1.067015e+00
## jobunknown 7.722928e-01 4.370054e-01 1.317924e+00
## maritalmarried 8.144410e-01 7.077684e-01 9.388294e-01
## maritalsingle 1.055745e+00 8.999763e-01 1.240091e+00
## educationsecondary 1.127181e+00 9.710105e-01 1.310370e+00
## educationtertiary 1.312200e+00 1.102556e+00 1.563396e+00
## educationunknown 1.160081e+00 9.092406e-01 1.475521e+00
## defaultyes 9.941458e-01 5.066015e-01 1.805444e+00
## balance 1.059530e+00 1.030087e+00 1.089963e+00
## housingyes 5.070777e-01 4.575137e-01 5.618063e-01
## loanyes 6.873349e-01 5.937521e-01 7.933379e-01
## contacttelephone 9.048262e-01 7.540019e-01 1.082362e+00
## contactunknown 2.076993e-01 1.753722e-01 2.455365e-01
## day 1.013138e+00 1.007156e+00 1.019160e+00
## monthaug 5.997309e-01 4.983613e-01 7.219516e-01
## monthdec 2.304960e+00 1.531320e+00 3.467470e+00
## monthfeb 1.022479e+00 8.283444e-01 1.261651e+00
## monthjan 3.178169e-01 2.392228e-01 4.193238e-01
## monthjul 4.730108e-01 3.934298e-01 5.685986e-01
## monthjun 1.947429e+00 1.563029e+00 2.427478e+00
## monthmar 6.672033e+00 4.985704e+00 8.938227e+00
## monthmay 7.998143e-01 6.749911e-01 9.484193e-01
## monthnov 4.754843e-01 3.910070e-01 5.776678e-01
## monthoct 2.743465e+00 2.111369e+00 3.564041e+00
## monthsep 2.743562e+00 2.043306e+00 3.685271e+00
## duration 6.191233e+00 5.801204e+00 6.613618e+00
## campaign 9.360862e-01 9.136403e-01 9.583175e-01
## pdays 9.996419e-01 9.989099e-01 1.000370e+00
## previous 1.007424e+00 9.936424e-01 1.024152e+00
## poutcomeother 1.244848e+00 1.002730e+00 1.542196e+00
## poutcomesuccess 1.072602e+01 8.818966e+00 1.307187e+01
## poutcomeunknown 9.298421e-01 7.475710e-01 1.158990e+00
#predictions
slrPredict_clean = predict(slr_clean,newdata = test,type = 'response')
plot(slrPredict_clean,
col = test$y,
main = 'Simple Log Regression Predictions',
ylab = 'Prediction',
xlab = 'Data Index')

slrPredictnew_clean<-factor(ifelse(as.numeric(slrPredict_clean)>.5,'yes','no' ) ,levels = c('no','yes'))
confusionMatrix(slrPredictnew_clean,test$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 6332 621
## yes 175 359
##
## Accuracy : 0.8937
## 95% CI : (0.8865, 0.9006)
## No Information Rate : 0.8691
## P-Value [Acc > NIR] : 5.054e-11
##
## Kappa : 0.4208
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9731
## Specificity : 0.3663
## Pos Pred Value : 0.9107
## Neg Pred Value : 0.6723
## Prevalence : 0.8691
## Detection Rate : 0.8457
## Detection Prevalence : 0.9287
## Balanced Accuracy : 0.6697
##
## 'Positive' Class : no
##
#Pearson Chi^2 Test
sum(residuals(slr_clean, type = "pearson")^2)
## [1] 37392.8
#Plotting Odds Ratios
plot_model(slr_clean,sort.est = TRUE,grid = TRUE,title = 'Odds Ratio for Explanitory Variables: SLR Clean')

# use full model for stepwise feature selection
StepwiseFeature =step(slr_clean,direction="both",trace=0)
summary(StepwiseFeature)
##
## Call:
## glm(formula = y ~ job + marital + education + balance + housing +
## loan + contact + day + month + duration + campaign + poutcome,
## family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9765 -0.3961 -0.2115 -0.0953 4.0865
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -11.845378 0.259682 -45.615 < 2e-16 ***
## jobblue-collar -0.369623 0.087719 -4.214 2.51e-05 ***
## jobentrepreneur -0.433791 0.154128 -2.814 0.00489 **
## jobhousemaid -0.414508 0.158900 -2.609 0.00909 **
## jobmanagement -0.129444 0.088908 -1.456 0.14541
## jobretired 0.138398 0.105632 1.310 0.19013
## jobself-employed -0.344286 0.131964 -2.609 0.00908 **
## jobservices -0.292437 0.102061 -2.865 0.00417 **
## jobstudent 0.297777 0.130611 2.280 0.02261 *
## jobtechnician -0.162431 0.083367 -1.948 0.05137 .
## jobunemployed -0.192938 0.131591 -1.466 0.14260
## jobunknown -0.264413 0.280575 -0.942 0.34599
## maritalmarried -0.199212 0.071755 -2.776 0.00550 **
## maritalsingle 0.074076 0.076957 0.963 0.33577
## educationsecondary 0.124322 0.075974 1.636 0.10176
## educationtertiary 0.280224 0.088256 3.175 0.00150 **
## educationunknown 0.148184 0.123472 1.200 0.23009
## balance 0.057168 0.014266 4.007 6.14e-05 ***
## housingyes -0.678794 0.051923 -13.073 < 2e-16 ***
## loanyes -0.373466 0.073838 -5.058 4.24e-07 ***
## contacttelephone -0.110210 0.091217 -1.208 0.22696
## contactunknown -1.568797 0.085728 -18.300 < 2e-16 ***
## day 0.013005 0.003022 4.304 1.68e-05 ***
## monthaug -0.509909 0.094380 -5.403 6.56e-08 ***
## monthdec 0.837434 0.208387 4.019 5.85e-05 ***
## monthfeb 0.028521 0.107168 0.266 0.79014
## monthjan -1.138658 0.142940 -7.966 1.64e-15 ***
## monthjul -0.745257 0.093886 -7.938 2.06e-15 ***
## monthjun 0.668745 0.112267 5.957 2.57e-09 ***
## monthmar 1.899374 0.148830 12.762 < 2e-16 ***
## monthmay -0.224068 0.086669 -2.585 0.00973 **
## monthnov -0.733985 0.098962 -7.417 1.20e-13 ***
## monthoct 1.010089 0.133336 7.576 3.58e-14 ***
## monthsep 1.008682 0.150363 6.708 1.97e-11 ***
## duration 1.822089 0.033407 54.542 < 2e-16 ***
## campaign -0.065978 0.012176 -5.419 6.01e-08 ***
## poutcomeother 0.236366 0.109131 2.166 0.03032 *
## poutcomesuccess 2.395740 0.097635 24.538 < 2e-16 ***
## poutcomeunknown -0.010416 0.069424 -0.150 0.88074
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 22686 on 29949 degrees of freedom
## Residual deviance: 14471 on 29911 degrees of freedom
## AIC: 14549
##
## Number of Fisher Scoring iterations: 6
#stepwise feature selection model
slr_fsr = glm(y ~ job+marital+education+balance+housing+loan+contact+day+month+duration+campaign+pdays+poutcome, family = 'binomial',data = train)
#predictions from stepwise output
slrPredict_fsr = predict(slr_fsr,newdata = test,type = 'response')
plot(slrPredict_fsr,
col = test$y,
main = 'Simple Log Regression Predictions: Stepwise',
ylab = 'Prediction',
xlab = 'Data Index')

slrPredictnew_fsr<-factor(ifelse(as.numeric(slrPredict_fsr)>.5,'yes','no' ) ,levels = c('no','yes'))
confusionMatrix(slrPredictnew_fsr,test$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 6331 623
## yes 176 357
##
## Accuracy : 0.8933
## 95% CI : (0.8861, 0.9002)
## No Information Rate : 0.8691
## P-Value [Acc > NIR] : 1.033e-10
##
## Kappa : 0.4183
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9730
## Specificity : 0.3643
## Pos Pred Value : 0.9104
## Neg Pred Value : 0.6698
## Prevalence : 0.8691
## Detection Rate : 0.8456
## Detection Prevalence : 0.9288
## Balanced Accuracy : 0.6686
##
## 'Positive' Class : no
##
#Pearson Chi^2 Test
sum(residuals(slr_fsr, type = "pearson")^2)
## [1] 37457.62
#CI and Odds Ratio
exp(cbind(OR = coef(slr_fsr), confint(slr_fsr)))
## OR 2.5 % 97.5 %
## (Intercept) 7.804253e-06 4.554068e-06 1.328962e-05
## jobblue-collar 6.913510e-01 5.821975e-01 8.211762e-01
## jobentrepreneur 6.472094e-01 4.759785e-01 8.714525e-01
## jobhousemaid 6.595975e-01 4.807179e-01 8.966372e-01
## jobmanagement 8.783147e-01 7.381404e-01 1.045948e+00
## jobretired 1.145135e+00 9.305747e-01 1.408334e+00
## jobself-employed 7.077094e-01 5.450623e-01 9.146071e-01
## jobservices 7.469701e-01 6.109913e-01 9.116240e-01
## jobstudent 1.343348e+00 1.038891e+00 1.733977e+00
## jobtechnician 8.491104e-01 7.212735e-01 1.000156e+00
## jobunemployed 8.245130e-01 6.358364e-01 1.065218e+00
## jobunknown 7.661636e-01 4.337011e-01 1.306873e+00
## maritalmarried 8.182866e-01 7.115278e-01 9.427170e-01
## maritalsingle 1.075195e+00 9.252878e-01 1.251220e+00
## educationsecondary 1.132777e+00 9.767253e-01 1.315680e+00
## educationtertiary 1.321804e+00 1.112378e+00 1.572376e+00
## educationunknown 1.159672e+00 9.089180e-01 1.475005e+00
## balance 1.058503e+00 1.029378e+00 1.088601e+00
## housingyes 5.093871e-01 4.598629e-01 5.640388e-01
## loanyes 6.883000e-01 5.946752e-01 7.943313e-01
## contacttelephone 8.971375e-01 7.490108e-01 1.071102e+00
## contactunknown 2.074275e-01 1.751554e-01 2.451975e-01
## day 1.013077e+00 1.007097e+00 1.019096e+00
## monthaug 5.987972e-01 4.976896e-01 7.206763e-01
## monthdec 2.305990e+00 1.532071e+00 3.468801e+00
## monthfeb 1.024354e+00 8.299547e-01 1.263833e+00
## monthjan 3.184947e-01 2.397521e-01 4.201825e-01
## monthjul 4.736439e-01 3.939787e-01 5.693281e-01
## monthjun 1.948718e+00 1.564093e+00 2.429038e+00
## monthmar 6.669112e+00 4.983695e+00 8.934010e+00
## monthmay 8.009444e-01 6.759731e-01 9.497209e-01
## monthnov 4.752064e-01 3.908233e-01 5.772624e-01
## monthoct 2.737726e+00 2.107409e+00 3.555813e+00
## monthsep 2.743455e+00 2.043301e+00 3.684981e+00
## duration 6.189700e+00 5.799857e+00 6.611876e+00
## campaign 9.362348e-01 9.137952e-01 9.584601e-01
## pdays 9.996322e-01 9.989000e-01 1.000360e+00
## poutcomeother 1.260022e+00 1.015951e+00 1.559498e+00
## poutcomesuccess 1.072117e+01 8.815222e+00 1.306562e+01
## poutcomeunknown 9.095883e-01 7.340954e-01 1.129013e+00
#Plotting Odds Ratio
plot_model(slr_fsr,sort.est = TRUE,grid = TRUE,title = 'Odds Ratio for Explanitory Variables:Stepwise LR')

anova(slr_fsr,slr_clean, test = "Chisq",class(slr_clean))
## Analysis of Deviance Table
##
## Model 1: y ~ job + marital + education + balance + housing + loan + contact +
## day + month + duration + campaign + pdays + poutcome
## Model 2: y ~ age + job + marital + education + default + balance + housing +
## loan + contact + day + month + duration + campaign + pdays +
## previous + poutcome
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 29910 14470
## 2 29907 14468 3 1.6833 0.6407
#cv model
relaxed_cv_model = cv.glmnet(train_expl,train_resp,family = 'binomial',relax = TRUE)
relaxed_best_lambda = relaxed_cv_model$lambda.min
#relaxed_best_lambda
#plot cv model
coef(relaxed_cv_model,lambda = relaxed_best_lambda)
## 17 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -9.056524739
## age .
## job .
## marital .
## education 0.179056934
## default .
## balance 0.095585138
## housing -1.080342743
## loan -0.655729480
## contact -0.520875956
## day .
## month .
## duration 1.654767784
## campaign -0.110197839
## pdays 0.002054007
## previous 0.073411054
## poutcome .
plot(relaxed_cv_model,lambda = relaxed_best_lambda)

#head(train_expl)
#use variables from cv
#more complex uses different cut-off
#creating a less "accurate model" but decreases the FNR as we want to determine who might default on a loan.
rlasso_pred = predict(relaxed_cv_model,test_expl,type = 'response',s = relaxed_best_lambda)
plot(rlasso_pred,col = test_resp,
main = 'Log Regression Predictions: Relaxed CV LASSO',
ylab = 'Prediction',
xlab = 'Data Index')

#use .1 for cut-offf
new_pred<-factor(ifelse(as.numeric(rlasso_pred)>.1,'yes','no') ,levels = c('no','yes'))
confusionMatrix(new_pred,test_resp)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 4654 130
## yes 1853 850
##
## Accuracy : 0.7351
## 95% CI : (0.725, 0.7451)
## No Information Rate : 0.8691
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3335
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7152
## Specificity : 0.8673
## Pos Pred Value : 0.9728
## Neg Pred Value : 0.3145
## Prevalence : 0.8691
## Detection Rate : 0.6216
## Detection Prevalence : 0.6390
## Balanced Accuracy : 0.7913
##
## 'Positive' Class : no
##
log.roc<-roc(response=test_resp,predictor=rlasso_pred,levels=c("no","yes"))
plot(log.roc,print.thres="best")
auc = auc(log.roc)
text(x = .4, y = .4,paste("AUC = ", round(auc,3), sep = ""))

anova(slr_fsr,relaxed_cv_model, test = "Chisq",class(relaxed_cv_model))
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: y
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 29949 22686
## job 11 534.0 29938 22152 < 2.2e-16 ***
## marital 2 80.1 29936 22072 < 2.2e-16 ***
## education 3 58.3 29933 22014 1.333e-12 ***
## balance 1 157.4 29932 21856 < 2.2e-16 ***
## housing 1 445.7 29931 21410 < 2.2e-16 ***
## loan 1 83.9 29930 21326 < 2.2e-16 ***
## contact 2 529.5 29928 20797 < 2.2e-16 ***
## day 1 31.9 29927 20765 1.607e-08 ***
## month 11 926.0 29916 19839 < 2.2e-16 ***
## duration 1 4361.0 29915 15478 < 2.2e-16 ***
## campaign 1 56.9 29914 15421 4.651e-14 ***
## pdays 1 74.7 29913 15346 < 2.2e-16 ***
## poutcome 3 876.9 29910 14470 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1